/* ========================================================================== */
/* ======================= PEER FX - DATA PREPARATION ======================= */
/* ========================================================================== */

*** Session setup and raw data import

* Pin the interpreter version, start from an empty session and raise matsize
version 14.2
clear all
set matsize 10000


* Project root: edit this path so it points at the main project directory.
* The /Data/ folder is expected as a first-level subdirectory of it.
global maindir "/home/someuser/somefolder/mainprojectdir"

* Read the raw file into memory, discarding whatever was loaded before
use "$maindir/Data/specialed_data.dta", clear


*** Sample restrictions

* Retain only rows whose swtyp equals "SW8"
drop if swtyp != "SW8"

* Remove rows with a missing special-school flag and rows flagged as special school
drop if spezschule == . | spezschule == 1


*** Manually correct the track
* Track membership is inferred from text patterns in the class name (klasse)
* and, for UG, also in the school name (schule). A pattern-based track is only
* accepted when exactly one of the four indicators fires; otherwise the value
* of klassentypfix is used as a fallback.

* Indicator for Real
gen real =  regexm(klasse,"eal")
replace real=1 if regexm(klasse,"[Rr]([a-zA-Z]*)?[0-9]([a-zA-Z]*)?")
replace real=1 if regexm(klasse,"[0-9][Rr](.)?")

* Indicator for Sek
gen sek = regexm(klasse,"[sS]ek")
replace sek=1 if regexm(klasse,"[sS][0-9]")
replace sek=1 if regexm(klasse,"[0-9][Ss]")

* Indicator for KK
gen kk = regexm(klasse,"[kK][kK]")
replace kk = 1 if regexm(klasse,"leinklas")
replace kk = 1 if regexm(klasse,"BNR[0-9]?")
replace kk = 1 if regexm(klasse,"[0-9]rk")

* Indicator for UG
gen ug = regexm(schule,"ymnasi")
replace ug = 1 if regexm(klasse,"[uU][gG]")

* Generate new correct variable
gen tracktyp =.
* Unambiguous pattern matches: exactly one indicator is set
replace tracktyp= 1 if  real==1 & sek ==0 & ug==0  & kk==0
replace tracktyp= 2 if  sek ==1 & real==0 & ug==0  & kk==0
replace tracktyp= 0 if  kk  ==1 & real==0 & sek==0 & ug==0
replace tracktyp= 3 if  ug  ==1 & real==0 & sek==0 & kk==0
* Fallback: take the code from klassentypfix where no unambiguous match exists
forvalues t = 0/3 {
    replace tracktyp = `t' if klassentypfix == `t' & tracktyp == .
}
* The label now lists the code for UG as well (it previously read "..., UG")
label variable tracktyp "0 KK, 1 Real, 2 Sek, 3 UG"
label def tracklab 0 "KK" 1 "Real" 2 "Sek" 3 "UG"
label values tracktyp tracklab
order tracktyp, before(klassentyp)
* Rows whose track could not be determined either way are removed
drop if tracktyp==.
tab klassentypfix tracktyp

* Collapse the four track codes into a binary indicator:
* tracktyp 2 or 3 -> 1, tracktyp 0 or 1 -> 0, anything else stays missing
gen hightrack = inlist(tracktyp, 2, 3) if inlist(tracktyp, 0, 1, 2, 3)
tab hightrack, miss
drop if hightrack == .
order hightrack, after(klassentypfix)


*** Cohort sizes (a cohort is one swtyp x schule x swjahr x hightrack cell)

egen cohort = group(swtyp schule swjahr hightrack)
bysort cohort: gen cohortsize = _N

* Restrict sn to pre-determined diagnoses
* Difference between first registration date and birth date, divided by 365.25
* and rounded to two decimals
gen ageatfirstreg = round((pddatumanmeldungfirst - geburtsdatum) / 365.25, .01)
* Negative values are treated as unknown
replace ageatfirstreg = . if ageatfirstreg < 0
* sn is set to 0 when the registration age is unknown, above 12,
* or not at least three years below swalter
replace sn = 0 if ageatfirstreg == . | ageatfirstreg > 12 | ageatfirstreg >= (swalter - 3)

* Generate treatment and drop if missing treatment
* snchild is the individual SN indicator and intsnchild its intensity. The
* replace statements below overwrite each other, so their order matters.
tab sn, miss
drop if sn==.
gen snchild = sn
* Blank out the action count unless pdregid is "Stadt"
replace pdanzahlaktionen=. if pdregid!="Stadt"
generate intsnchild = .
replace  intsnchild = 0 if snchild==0
* Intensity: the contact count is used first, the action count only as a fallback
replace  intsnchild = pdanzahlkontakte if pdanzahlkontakte!=. & intsnchild==.
replace  intsnchild = pdanzahlaktionen if pdanzahlaktionen!=. & intsnchild==.
* Zero recorded diagnoses -> not counted as SN
replace snchild=0    if pdanzahldiagnosen==0
replace intsnchild=0 if pdanzahldiagnosen==0
* Intensity below 2 -> not counted as SN (a missing intensity is not < 2 in
* Stata and is therefore left untouched by these two lines)
replace snchild=0    if intsnchild<2
replace intsnchild=0 if intsnchild<2
replace intsnchild=0 if snchild==0
* Leave-one-out class aggregates: the own value is subtracted from the class
* total; the share is missing when clsizefix is 1 or missing
bysort classfix: egen total = total(snchild)
generate shsnchildren = ((total - snchild) / (clsizefix - 1))
generate snchildren   = (total  - snchild)
drop total
order snchild shsnchildren* snchildren* , after(sn)
* The registration age is blanked for children not counted as SN
replace ageatfirstreg = . if snchild==0

* Generate alternative treatment: at least one SN peer in the class
* Stata treats a missing value as larger than any number, so a bare
* "shsnchildren>0" is also true when the share is missing (which happens when
* clsizefix is 1 or missing). The indicator is therefore left missing whenever
* the share is undefined instead of being set to 1.
gen atleastonesn = (shsnchildren > 0) if !missing(shsnchildren)
tab atleastonesn

* Cohort-level treatment: for the SN indicator and the SN intensity, build the
* leave-one-out cohort sum (slevel_*) and that sum divided by the number of
* cohort peers (sshare_*)
foreach v of varlist snchild intsnchild {
    tempvar cohortsum
    bysort cohort: egen `cohortsum' = total(`v')
    generate sshare_`v' = (`cohortsum' - `v') / (cohortsize - 1)
    generate slevel_`v' = `cohortsum' - `v'
    * Dropped explicitly so the helper never reaches the saved data set
    drop `cohortsum'
}
order sshare*, after(snchildren)

* Domestic violence shares (cohort)
* famprob starts at 0 for every row with a non-missing snchild and is switched
* to 1 by any of the text matches below; it is reset to 0 for non-SN children.
gen famprob = 0 if snchild!=.
* Registration reasons 1-3 mentioning "Familie"
replace famprob = 1 if regexm(pdanmeldegrund1, "Familie.*") | ///
                       regexm(pdanmeldegrund2, "Familie")   | ///
                       regexm(pdanmeldegrund3, "Familie")
* Keywords in the free-text remarks
replace famprob = 1 if regexm(pdbemerkungen,"[gG]ewalt|missbrauch|missha|Schläge")
* Any pdcdiagnose* variable mentioning "misshandlung" (case-insensitive).
* foreach is used instead of the obsolete, undocumented -for- command.
foreach diag of varlist pdcdiagnose* {
    replace famprob = 1 if regexm(lower(`diag'),"(misshandlung.*)")
}
replace famprob=0 if snchild==0
* Leave-one-out cohort share of flagged children
bysort cohort: egen total = total(famprob)
generate shfamprob = ((total - famprob) / (cohortsize - 1))
drop total

* Remove rows lacking any of the fixed-effect identifiers
drop if schule == "" | klassentypfix == . | swjahr == .

* Covariates: switch to analysis names and remove rows where they are missing
rename swalter age
drop if age == .
gen oldattest = (age > 16) if age != .
rename geschlecht female
drop if female == .
gen native = 1 - fremdsprachig
drop if native == .
order native, after(fremdsprachig)

* Both the math and the German score must be observed
drop if pptmathematik == . | pptdeutsch == .

* Age window: strictly above 11 and strictly below 18
drop if age <= 11 | age >= 18

* Class restrictions (high ISF share and segregated classes)
* shareisf (class sum of isf divided by clsizefix) is built and discarded
* again; the exclusions themselves are hard-coded class ids.
tempvar isfsum
bysort classfix: egen `isfsum' = total(isf)
generate shareisf = `isfsum' / clsizefix
drop `isfsum' shareisf

* First list of excluded classes
* (presumably the high-ISF-share classes - TODO confirm)
drop if inlist(classfix, 186, 337, 623, 679, 692, 1052, 2025, ///
                         2122, 2386, 2815, 2836, 2992, 3189, 3281)

* Second list of excluded classes
* (presumably the segregated classes - TODO confirm)
drop if inlist(classfix, 1058, 1764, 1781, 1856, 1959, 2004, 2489, 2647, 3195)

* Class size restrictions: keep rows with a recorded class size of 10 to 31
tab clsizefix, miss
drop if clsizefix == . | clsizefix < 10 | clsizefix > 31

* Low coverage: remove classes with fewer than 10 rows left in the sample
bysort classfix: drop if _N < 10


*** Outcomes: composite score (math and German separately as robustness)

* Row means over the component scores
egen composite = rowmean(pptmathematik pptdeutsch)
egen pptntwiss = rowmean(pptntchemie pptntphysik pptntbiologie)
order composite, after(pptdeutsch)
order pptntwiss, after(pptntbiologie)

* Standardised version (std_*) of every score
foreach score of varlist pptmathematik pptdeutsch composite pptenglisch pptntwiss {
    egen std_`score' = std(`score')
}
order std*, after(pptvorstellung)


*** Severity and type of SN

* SN indicator and class-level sum of the SN intensity (own value included)
tab snchild
bysort classfix: egen intsnchildren = total(intsnchild)
order intsnchild intsnchildren, after(snchildren)

* Disruptive subtype: any of the three registration reasons equals one of the
* behaviour-related categories listed below
gen disruptive = 0 if snchild != .
forvalues k = 1/3 {
    replace disruptive = 1 if inlist(pdanmeldegrund`k', ///
        "Emotional- und Sozialverhalten", ///
        "SAP_Gewalt im Schulzimmer/Schulhaus", ///
        "SAP_Verhalten", ///
        "SPD_Emotional- und Sozialverhalten", ///
        "Verhalten")
}
* Children not counted as SN are never disruptive
replace disruptive = 0 if snchild == 0

* Leave-one-out class share and count of disruptive children
tempvar disrsum
bysort classfix: egen `disrsum' = total(disruptive)
generate shdisruptive = (`disrsum' - disruptive) / (clsizefix - 1)
generate disrchildren = `disrsum' - disruptive
* Dropped explicitly so the helper never reaches the saved data set
drop `disrsum'
order disruptive shdisruptive disrchildren, after(intsnchildren)

* Learning-problem subtype: any of the three registration reasons equals one
* of the performance-related categories listed below
gen learnproblem = 0 if snchild != .
forvalues k = 1/3 {
    replace learnproblem = 1 if inlist(pdanmeldegrund`k', ///
        "Leistungs- und Lernverhalten", ///
        "SPD_Leistungs- und Lernverhalten", ///
        "Schulleistungen")
}
replace learnproblem = 0 if snchild == 0

* Disruptive takes precedence: subtracting it turns (learn=1, disr=1) into 0
* and (learn=0, disr=1) into -1, which is then set back to 0
replace learnproblem = learnproblem - disruptive
replace learnproblem = 0 if learnproblem == -1

* Leave-one-out class share and count of children with learning problems
tempvar learnsum
bysort classfix: egen `learnsum' = total(learnproblem)
generate shlearn       = (`learnsum' - learnproblem) / (clsizefix - 1)
generate learnchildren = `learnsum' - learnproblem
* Dropped explicitly so the helper never reaches the saved data set
drop `learnsum'
order learnproblem shlearn learnchildren, after(disrchildren)


*** Fixed effects: one value-labelled group id per combination

* School
egen schoolid = group(schule), label
* School within year
egen schoolxyear = group(swjahr schule), label
* School by track
egen schoolxtrack = group(schule hightrack), label
* School by track within year
egen schoolxtrackxyear = group(swjahr schule hightrack), label


*** Controls (age continuous, age in dummies, gender, native, class size)

* Leave-one-out class means: the own value is removed from the class total and
* the result is divided by the number of classmates currently in the sample
foreach var of varlist female native age {
    bysort classfix: egen total = total(`var')
    bysort classfix: gen count = _N
    generate clmean`var' = ((total - `var') / (count - 1))
    drop total count
}
* geschlecht was renamed to female earlier in this file, so the anchor has to
* be female; after(geschlecht) would fail with "variable not found"
order clmean*, after(female)


*** Cohort-level covariates

* Leave-one-out cohort means of gender, native status and age
foreach v of varlist female native age {
    tempvar cohsum cohn
    bysort cohort: egen `cohsum' = total(`v')
    bysort cohort: gen `cohn' = _N
    generate slmean`v' = (`cohsum' - `v') / (`cohn' - 1)
    * Dropped explicitly so the helpers never reach the saved data set
    drop `cohsum' `cohn'
}
order slmean*, after(clmeanage)


*** Generate additional intensity measures (topsn and contacts per class in dummies)

* Top SN child (top 25%)
* NOTE: xtile() is not an official egen function; it is provided by the
* user-written egenmore package (ssc install egenmore).
egen qtsnchild = xtile(intsnchild) if snchild==1, nq(4)
gen topsnchild = cond(qtsnchild==4,1,0) if snchild!=.
* Leave-one-out class share and count of top-quartile SN children
bysort classfix: egen total = total(topsnchild)
generate shtopsn       = ((total - topsnchild) / (clsizefix - 1))
generate topsnchildren = total - topsnchild
drop total

* Share of SN children from a specific quartile of intensity
* One dummy per quartile; the dummies are missing for rows without a quartile
qui tab qtsnchild, gen(intsnchildqt_)
forvalues q = 1(1)4 {
    * Class sum of the quartile dummy, then divided by the number of classmates.
    * (The unused helper that summed the loop constant has been removed.)
    * NOTE(review): unlike the other class shares in this file, the own value
    * is not subtracted from the class sum here - confirm this is intended.
    bysort classfix: egen intsnchildqtsum_`q' = total(intsnchildqt_`q')
    gen shintsnchildren_`q' = (intsnchildqtsum_`q' / (clsizefix - 1))
}
drop qtsnchild intsnchildqt_*
order topsnchild shtopsn topsnchildren intsnchildqtsum_* shintsnchildren_*, after(learnchildren)


*** Mid-term outcomes (apprenticeship, high school and apprenticeship wage)

* Categorical outcome (1: no post compulsory, 2: VET, 3: high school).
* bslehre==1 always yields 2, also when msmatura==1; anything that is not
* equal to 1 (including missing) counts as "no".
gen educhoice = cond(bslehre == 1, 2, cond(msmatura == 1, 3, 1))

* Any post-compulsory education (VET or high school) versus none
gen postcompulsory = (educhoice != 1)

* High school versus everything else (no post-compulsory education or VET)
gen highschool = (educhoice == 3)

* VET versus everything else (no post-compulsory education or high school)
gen apprentice = (educhoice == 2)

* VET (1) versus high school (0), defined only for those with either
gen vetvsaca = (educhoice == 2) if inlist(educhoice, 2, 3)

* VET quality: bsabschlussefzon equals 1, defined for apprentices only
gen vetquality = (bsabschlussefzon == 1) if apprentice == 1

* Highest apprenticeship wage across all recorded wage variables, and its log
* (the log is kept for apprentices only)
egen happwage = rowmax(bslohn1jahr1 bslohn2jahr1 bslohn3jahr1 bslohn4jahr1 ///
                       bslohn1jahr2 bslohn2jahr2 bslohn3jahr2 bslohn4jahr2 ///
                       bslohn1jahr3 bslohn2jahr3 bslohn3jahr3 bslohn4jahr3)
gen lnappwage = ln(happwage) if apprentice != 0

order educhoice postcompulsory highschool apprentice vetvsaca vetquality happwage lnappwage, before(schoolxyear)


*** Long-term outcomes (income and unemployment)

* Yearly employment indicator: 1 when bfsdduree equals 12, 0 for any other
* recorded value, missing when bfsdduree is missing
forvalues yr = 2007/2016 {
    gen employed`yr' = (bfsdduree`yr' == 12) if bfsdduree`yr' != .
}

* Yearly income per unit of bfsdduree (yearly sum divided by duration)
forvalues yr = 2007/2016 {
    gen monthlyinc`yr' = bfsmrevsom`yr' / bfsdduree`yr'
}

* Mean, last non-missing and maximum of the yearly duration variables
egen avg_employed     = rowmean(bfsdduree*)
egen last_employed    = rowlast(bfsdduree*)
egen highest_employed = rowmax(bfsdduree*)

* Mean, last non-missing and maximum of the yearly income variables
egen avg_monthly     = rowmean(monthlyinc*)
egen last_monthly    = rowlast(monthlyinc*)
egen highest_monthly = rowmax(monthlyinc*)


*** Clean up the data set (cosmetic: remove variables not needed downstream)

* School / class bookkeeping and unused test scores
local drop_school spezschule mittelschule klassentyp klassentypfillold klassentypfillimpmode ///
	klassentypfillimpklasse klassentypbak klassentypold pptnaturundtechnik ppttlv pptvorstellung ///
	class clsize classmode clsizemode classklasse clsizeklasse nschule plznschule

* pd* variables: case, action and counselling-contact details
local drop_pdcase pdfallidreihe pdanzahldiagnosen pdanzahlaktionen pdaktionbez1 ///
	pdaktionbez2 pdaktionbez3 pdaktionbez4 pdaktionbez5 pdaktionbezreihe pdmassnahmereihe ///
	pdschuljahrfirst pdfamilienid pdanzahlkontakte pdartberatungskontakt1 pdartberatungskontakt2 ///
	pdartberatungskontakt3 pdartberatungskontakt4 pdartberatungskontakt5 pdartberatungskontakt6 ///
	pdartberatungskontaktreihe pdschulgemeindefirst pdalleempfehlungen

* pd* variables: assigned support, referrals and related details
local drop_pdhelp pdklassenzuteilung pdallepdhilfen pdpdhilfe1 pdpdhilfe2 pdpdhilfe3 pdpdhilfe4 pdpdhilfeandere ///
	pdpdhilfelogopd pdpdhilfelega pdallepsychhilfen pdpsychhilfe1 pdpsychhilfeandere ///
	pdallezuweisungen pdsonderschule pdsonderschuletypfirst pdrueckschulung pdtherapeutin ///
	pdgeschwisterbez pdsprache

* pd* variables: IQ test results and free-text remarks
local drop_pdiq pdiqson1 pdiqson2 pdiqson3 pdiqkabc1 pdiqkabc2 pdiqkabc3 ///
	pdiqhawik41 pdiqhawik42 pdiqhawik31 pdiqhawik32 pdiqhawik33 pdiqraven1 pdiqraven2 pdiqraven3 ///
	pdiqpsb1 pdiqpsb2 pdiqkramer1 pdiqkramer2 pdiqkramer3 pdiqcft1 pdiqcft2 pdiqcft3 pdiqcft4 ///
	pdbemerkungen

* Remaining identifiers and nstat* variables
local drop_other msplz bfsid2 bfsid3 ///
	nstatschgeschlechtdom nstatschfreqtot nstatschfreqm nstatschfreqf nstatsaugeschlechtdom ///
	nstatsaufreqtot nstatsaufreqm nstatsaufreqf nstatsusgeschlechtdom nstatsusfreqtot nstatsusfreqm ///
	nstatsusfreqf nstatsapigeschlecht nstatsinternet

* One single drop, so either all listed variables are removed or none
drop `drop_school' `drop_pdcase' `drop_pdhelp' `drop_pdiq' `drop_other'


*** Write the final data set (Stata format plus semicolon-delimited CSV)

* Shrink storage types before writing
compress
save "$maindir/Data/peerfxdata.dta", replace
export delimited using "$maindir/Data/peerfxdata.csv", delimiter(";") replace
